In [24]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import pandas as pd
import requests
from datetime import datetime

In [2]:
url = "https://www.zueriwieneu.ch/report/12793"
response = requests.get(url)
züri_soup = BeautifulSoup(response.text, 'html.parser')

Adding Exception


In [40]:
url = "https://www.zueriwieneu.ch/report/3"
response = requests.get(url)
züri_soup = BeautifulSoup(response.text, 'html.parser')

In [13]:
lst_pass


Out[13]:
['https://www.zueriwieneu.ch/report/3']

In [14]:
float(züri_soup.find('div', {'id':'js-map-data'}).get('data-latitude'))


Out[14]:
47.338707

In [15]:
float(züri_soup.find('div', {'id':'js-map-data'}).get('data-longitude'))


Out[15]:
8.524772

Adding Counter


In [20]:
import time
import progressbar

In [21]:
bar = progressbar.ProgressBar()
lst = []
lst_pass = []


for elem,i in zip(range(1,13000), bar((range(1,13000)))):
    
    url = "https://www.zueriwieneu.ch/report/" + str(elem)
    response = requests.get(url)
    züri_soup = BeautifulSoup(response.text, 'html.parser')
    
    if züri_soup.find('h1').text != 'Melden Sie Schäden an der Infrastruktur von Zürich':
        Mini_dict = {
            'Kategorie' : züri_soup.find('h1').text,
            'Meldedatum' : züri_soup.find('div', {'class':'problem-header clearfix'}).find('p').text.strip(),
            'Meldung' : züri_soup.find('div', {'class':'problem-header clearfix'}).find_all('p')[1],
            'Antwortdatum' : züri_soup.find('ul', {'class':'item-list item-list--updates'}).find_all('p')[0].text,
            'Antwort' : züri_soup.find('ul', {'class':'item-list item-list--updates'}).find_all('p')[1].text,
            'URL' : url,
            'Lat' : float(züri_soup.find('div', {'id':'js-map-data'}).get('data-latitude')),
            'Long': float(züri_soup.find('div', {'id':'js-map-data'}).get('data-longitude'))
            }  
        
        lst.append(Mini_dict)
        
    else:
        lst_pass.append(url)


  1% (215 of 12999) |                      | Elapsed Time: 0:05:12 ETA: 6:08:01
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-21-5c0e884494fa> in <module>()
      7 
      8     url = "https://www.zueriwieneu.ch/report/" + str(elem)
----> 9     response = requests.get(url)
     10     züri_soup = BeautifulSoup(response.text, 'html.parser')
     11 

~/.virtualenvs/master/lib/python3.5/site-packages/requests/api.py in get(url, params, **kwargs)
     70 
     71     kwargs.setdefault('allow_redirects', True)
---> 72     return request('get', url, params=params, **kwargs)
     73 
     74 

~/.virtualenvs/master/lib/python3.5/site-packages/requests/api.py in request(method, url, **kwargs)
     56     # cases, and look like a memory leak in others.
     57     with sessions.Session() as session:
---> 58         return session.request(method=method, url=url, **kwargs)
     59 
     60 

~/.virtualenvs/master/lib/python3.5/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    506         }
    507         send_kwargs.update(settings)
--> 508         resp = self.send(prep, **send_kwargs)
    509 
    510         return resp

~/.virtualenvs/master/lib/python3.5/site-packages/requests/sessions.py in send(self, request, **kwargs)
    616 
    617         # Send the request
--> 618         r = adapter.send(request, **kwargs)
    619 
    620         # Total elapsed time of the request (approximately)

~/.virtualenvs/master/lib/python3.5/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    438                     decode_content=False,
    439                     retries=self.max_retries,
--> 440                     timeout=timeout
    441                 )
    442 

~/.virtualenvs/master/lib/python3.5/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    599                                                   timeout=timeout_obj,
    600                                                   body=body, headers=headers,
--> 601                                                   chunked=chunked)
    602 
    603             # If we're going to release the connection in ``finally:``, then

~/.virtualenvs/master/lib/python3.5/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    344         # Trigger any extra validation we need to do.
    345         try:
--> 346             self._validate_conn(conn)
    347         except (SocketTimeout, BaseSSLError) as e:
    348             # Py2 raises this as a BaseSSLError, Py3 raises it as socket timeout.

~/.virtualenvs/master/lib/python3.5/site-packages/urllib3/connectionpool.py in _validate_conn(self, conn)
    848         # Force connect early to allow us to validate the connection.
    849         if not getattr(conn, 'sock', None):  # AppEngine might not have  `.sock`
--> 850             conn.connect()
    851 
    852         if not conn.is_verified:

~/.virtualenvs/master/lib/python3.5/site-packages/urllib3/connection.py in connect(self)
    324             ca_cert_dir=self.ca_cert_dir,
    325             server_hostname=hostname,
--> 326             ssl_context=context)
    327 
    328         if self.assert_fingerprint:

~/.virtualenvs/master/lib/python3.5/site-packages/urllib3/util/ssl_.py in ssl_wrap_socket(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir)
    327         context.load_cert_chain(certfile, keyfile)
    328     if HAS_SNI:  # Platform-specific: OpenSSL with enabled SNI
--> 329         return context.wrap_socket(sock, server_hostname=server_hostname)
    330 
    331     warnings.warn(

/usr/local/Cellar/python3/3.5.1/Frameworks/Python.framework/Versions/3.5/lib/python3.5/ssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname)
    374                          suppress_ragged_eofs=suppress_ragged_eofs,
    375                          server_hostname=server_hostname,
--> 376                          _context=self)
    377 
    378     def wrap_bio(self, incoming, outgoing, server_side=False,

/usr/local/Cellar/python3/3.5.1/Frameworks/Python.framework/Versions/3.5/lib/python3.5/ssl.py in __init__(self, sock, keyfile, certfile, server_side, cert_reqs, ssl_version, ca_certs, do_handshake_on_connect, family, type, proto, fileno, suppress_ragged_eofs, npn_protocols, ciphers, server_hostname, _context)
    745                         # non-blocking
    746                         raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
--> 747                     self.do_handshake()
    748 
    749             except (OSError, ValueError):

/usr/local/Cellar/python3/3.5.1/Frameworks/Python.framework/Versions/3.5/lib/python3.5/ssl.py in do_handshake(self, block)
    981             if timeout == 0.0 and block:
    982                 self.settimeout(None)
--> 983             self._sslobj.do_handshake()
    984         finally:
    985             self.settimeout(timeout)

/usr/local/Cellar/python3/3.5.1/Frameworks/Python.framework/Versions/3.5/lib/python3.5/ssl.py in do_handshake(self)
    626     def do_handshake(self):
    627         """Start the SSL/TLS handshake."""
--> 628         self._sslobj.do_handshake()
    629         if self.context.check_hostname:
    630             if not self.server_hostname:

KeyboardInterrupt: 

In [28]:
date = time.strftime("%Y-%m-%d%H:%M:%S")

In [30]:
pd.DataFrame(lst).to_csv(date+'züriwieneu.csv')

In [ ]: